This document demonstrates the multipathaic package for multi-path model selection with stability analysis using real-world medical datasets.
Traditional forward selection follows a single “best” path through model space. Our approach instead explores several competitive paths in parallel and quantifies, via resampling, how stable each selected variable is.
# Install from GitHub (run once; comment out after installation)
remotes::install_github("R-4-Data-Science/Final_Project_multipathaic")
# Load required packages
library(multipathaic)
library(care) # For diabetes data
library(mlbench) # For breast cancer data
library(dplyr)
library(tidyr)
# Load diabetes dataset
data(efron2004, package = "care")
# The efron2004 dataset is a LIST with two components:
# - efron2004$x: 442 x 10 matrix of predictors
# - efron2004$y: vector of 442 responses
Diabetes Progression Dataset
# Display summary in a nice table.
# colMeans()/apply() return *named* vectors; data.frame() picks those
# names up as row names, and kable() then prints them as an extra
# leading column that duplicates `Variable`. Strip the row names in
# the constructor and suppress them in kable() so the rendered table
# has exactly the five intended columns.
predictor_summary <- data.frame(
  Variable = colnames(efron2004$x),
  Mean = round(colMeans(efron2004$x), 3),
  SD = round(apply(efron2004$x, 2, sd), 3),
  Min = round(apply(efron2004$x, 2, min), 3),
  Max = round(apply(efron2004$x, 2, max), 3),
  row.names = NULL
)
knitr::kable(predictor_summary,
             row.names = FALSE,
             caption = "Summary Statistics of Baseline Predictors",
             align = c('l', 'r', 'r', 'r', 'r'))
| Variable | Mean | SD | Min | Max |
|---|---|---|---|---|
| age | 0 | 1 | -2.252 | 2.325 |
| sex | 0 | 1 | -0.937 | 1.064 |
| bmi | 0 | 1 | -1.896 | 3.582 |
| bp | 0 | 1 | -2.360 | 2.773 |
| s1 | 0 | 1 | -2.662 | 3.232 |
| s2 | 0 | 1 | -2.428 | 4.175 |
| s3 | 0 | 1 | -2.148 | 3.805 |
| s4 | 0 | 1 | -1.604 | 3.890 |
| s5 | 0 | 1 | -2.648 | 2.806 |
| s6 | 0 | 1 | -2.893 | 2.848 |
Response Variable: Mean = -0.00, SD = 1.00, Range = [-1.65, 2.51]
# Extract predictors and response from the list
X_base <- efron2004$x # Already a matrix
y_diabetes <- efron2004$y # Already a vector

# Standardize predictors (they are already standardized, but we'll ensure consistency)
X_scaled <- scale(X_base)
colnames(X_scaled) <- colnames(X_base)

# Create quadratic terms
X_quad <- X_scaled^2
colnames(X_quad) <- paste0(colnames(X_base), "_sq")

# Create all pairwise interactions with combn() instead of a nested
# loop that grows a list element-by-element. combn(n, 2) enumerates
# (i, j) pairs with i < j in the same order as the original loop, so
# the column order of X_interactions is unchanged.
pair_idx <- combn(ncol(X_scaled), 2)
X_interactions <- X_scaled[, pair_idx[1, ], drop = FALSE] *
  X_scaled[, pair_idx[2, ], drop = FALSE]
colnames(X_interactions) <- paste0(colnames(X_base)[pair_idx[1, ]], ":",
                                   colnames(X_base)[pair_idx[2, ]])

# Combine linear, quadratic, and interaction features
X_diabetes <- cbind(X_scaled, X_quad, X_interactions)
# Summarize the engineered feature set: count and first few example
# names for each feature family, plus the grand total.
count_and_examples <- function(nms) {
  list(n = length(nms), ex = paste(head(nms, 3), collapse = ", "))
}
lin <- count_and_examples(colnames(X_scaled))
quad <- count_and_examples(colnames(X_quad))
inter <- count_and_examples(colnames(X_interactions))
feature_summary <- data.frame(
  Feature_Type = c("Original (Linear)", "Quadratic Terms", "Interaction Terms", "Total Features"),
  Count = c(lin$n, quad$n, inter$n, ncol(X_diabetes)),
  Examples = c(lin$ex, quad$ex, inter$ex, "All combined")
)
knitr::kable(feature_summary,
             caption = "Engineered Feature Set",
             col.names = c("Feature Type", "Count", "Examples"),
             align = c('l', 'r', 'l'))
| Feature Type | Count | Examples |
|---|---|---|
| Original (Linear) | 10 | age, sex, bmi |
| Quadratic Terms | 10 | age_sq, sex_sq, bmi_sq |
| Interaction Terms | 45 | age:sex, age:bmi, age:bp |
| Total Features | 65 | All combined |
# Create train/test split (80/20).
# Fix the RNG seed so the split - and every downstream result in this
# section - is reproducible; uses the same seed as the cancer analysis
# later in the document. (The original chunk called sample() unseeded.)
set.seed(2025)
n <- nrow(X_diabetes)
train_idx <- sample(seq_len(n), size = floor(0.8 * n))
test_idx <- setdiff(seq_len(n), train_idx)
X_train <- X_diabetes[train_idx, ]
y_train <- y_diabetes[train_idx]
X_test <- X_diabetes[test_idx, ]
y_test <- y_diabetes[test_idx]
# Summarize the split for the report
split_summary <- data.frame(
  Dataset = c("Training", "Testing", "Total"),
  Observations = c(length(train_idx), length(test_idx), n),
  Percentage = c(
    sprintf("%.1f%%", 100 * length(train_idx) / n),
    sprintf("%.1f%%", 100 * length(test_idx) / n),
    "100.0%"
  ),
  Features = c(ncol(X_train), ncol(X_test), ncol(X_diabetes))
)
knitr::kable(split_summary,
             caption = "Data Split Summary (80/20)",
             align = c('l', 'r', 'r', 'r'))
| Dataset | Observations | Percentage | Features |
|---|---|---|---|
| Training | 353 | 79.9% | 65 |
| Testing | 89 | 20.1% | 65 |
| Total | 442 | 100.0% | 65 |
# Multi-path forward selection on the engineered diabetes features.
# K caps the number of forward steps; L caps how many candidate models
# are carried from one step to the next (see "Limiting to best L" in
# the verbose log below). delta and eps control which near-tied child
# models are retained - see the multipathaic documentation for exact
# semantics.
paths_diabetes <- build_paths(
X = X_train,
y = y_train,
family = "gaussian",
K = 10, # Reduced from 15 for faster computation
eps = 1e-6,
delta = 2,
L = 50, # Reduced from 100
verbose = TRUE
)
## Starting multi-path search with:
## n = 353 observations
## p = 65 predictors
## K = 10 maximum steps
## family = gaussian
## delta = 2.0000
## eps = 0.000001
## L = 50
##
## Step 0: Empty model, AIC = 1001.91
##
## ========== Step 1 ==========
## Starting with 1 parent models
## Generated 1 children, 1 unique
## Step 1 complete: 1 models kept, best AIC = 861.53
##
## ========== Step 2 ==========
## Starting with 1 parent models
## Generated 1 children, 1 unique
## Step 2 complete: 1 models kept, best AIC = 795.10
##
## ========== Step 3 ==========
## Starting with 1 parent models
## Generated 2 children, 2 unique
## Step 3 complete: 2 models kept, best AIC = 784.20
##
## ========== Step 4 ==========
## Starting with 2 parent models
## Generated 3 children, 2 unique
## Step 4 complete: 2 models kept, best AIC = 773.17
##
## ========== Step 5 ==========
## Starting with 2 parent models
## Generated 2 children, 1 unique
## Step 5 complete: 1 models kept, best AIC = 763.44
##
## ========== Step 6 ==========
## Starting with 1 parent models
## Generated 2 children, 2 unique
## Step 6 complete: 2 models kept, best AIC = 758.50
##
## ========== Step 7 ==========
## Starting with 2 parent models
## Generated 2 children, 2 unique
## Step 7 complete: 2 models kept, best AIC = 749.44
##
## ========== Step 8 ==========
## Starting with 2 parent models
## Generated 26 children, 25 unique
## Step 8 complete: 25 models kept, best AIC = 748.32
##
## ========== Step 9 ==========
## Starting with 25 parent models
## Generated 330 children, 256 unique
## Limiting to best L = 50 models
## Step 9 complete: 50 models kept, best AIC = 746.49
##
## ========== Step 10 ==========
## Starting with 50 parent models
## Generated 758 children, 632 unique
## Limiting to best L = 50 models
## Step 10 complete: 50 models kept, best AIC = 744.25
# Summarize the search: models kept per step and the best final model
print(paths_diabetes)
## Multi-Path Forward Selection Result
## ====================================
## Family: gaussian
## Sample size: 353
## Number of predictors: 65
## Steps completed: 10
## Parameters: eps=1.00e-06, delta=2.00, L=50
##
## Models per step:
## Step 1: 1 models
## Step 2: 1 models
## Step 3: 2 models
## Step 4: 2 models
## Step 5: 1 models
## Step 6: 2 models
## Step 7: 2 models
## Step 8: 25 models
## Step 9: 50 models
## Step 10: 50 models
##
## Best model at final step (AIC = 744.25):
## Variables: bmi, s5, bp, age:sex, bmi:s6, sex, s3, s1, s2, bp:s5
# Visualize the AIC trajectory across selection steps
plot_aic_by_step(paths_diabetes)
Interpretation: AIC decreases substantially in early steps, showing clear signal in the diabetes progression data. The curve begins to flatten after 8-10 predictors, suggesting diminishing returns.
# Stability analysis: rerun the multi-path selection on B bootstrap
# resamples and record each variable's selection frequency pi_j
# (reported in the output below).
stab_diabetes <- stability(
X = X_train,
y = y_train,
family = "gaussian",
B = 20, # 20 bootstrap resamples (reduced for faster computation)
resample_type = "bootstrap",
K = 10, # Reduced from 15
eps = 1e-6,
delta = 2,
L = 50, # Reduced from 100
verbose = TRUE
)
## ==============================================
## Starting Stability Analysis via Resampling
## ==============================================
## Number of resamples: B = 20
## Resample type: bootstrap
## Family: gaussian
##
## Resample 1 of 20...
## Resample 10 of 20...
## Resample 20 of 20...
##
## ==============================================
## Stability Analysis Complete
## ==============================================
## Stability scores (pi_j):
## age sex bmi bp s1 s2 s3 s4 s5 s6
## 0.042 0.344 0.995 0.910 0.358 0.147 0.448 0.120 0.995 0.049
## age_sq sex_sq bmi_sq bp_sq s1_sq s2_sq s3_sq s4_sq s5_sq s6_sq
## 0.169 0.342 0.052 0.010 0.071 0.068 0.021 0.017 0.136 0.150
## age:sex age:bmi age:bp age:s1 age:s2 age:s3 age:s4 age:s5 age:s6 sex:bmi
## 0.433 0.000 0.270 0.011 0.037 0.022 0.050 0.037 0.032 0.121
## sex:bp sex:s1 sex:s2 sex:s3 sex:s4 sex:s5 sex:s6 bmi:bp bmi:s1 bmi:s2
## 0.126 0.027 0.026 0.113 0.012 0.002 0.054 0.198 0.037 0.053
## bmi:s3 bmi:s4 bmi:s5 bmi:s6 bp:s1 bp:s2 bp:s3 bp:s4 bp:s5 bp:s6
## 0.007 0.049 0.053 0.293 0.040 0.036 0.032 0.012 0.064 0.010
## s1:s2 s1:s3 s1:s4 s1:s5 s1:s6 s2:s3 s2:s4 s2:s5 s2:s6 s3:s4
## 0.025 0.021 0.039 0.039 0.070 0.001 0.010 0.028 0.063 0.024
## s3:s5 s3:s6 s4:s5 s4:s6 s5:s6
## 0.047 0.035 0.053 0.054 0.046
# Print per-variable selection frequencies across the resamples
print(stab_diabetes)
## Variable Stability Analysis
## ============================
## Number of resamples: B = 20
## Resample type: bootstrap
## Family: gaussian
##
## Stability Scores (pi_j):
## age sex bmi bp s1 s2 s3 s4 s5 s6
## 0.042 0.344 0.995 0.910 0.358 0.147 0.448 0.120 0.995 0.049
## age_sq sex_sq bmi_sq bp_sq s1_sq s2_sq s3_sq s4_sq s5_sq s6_sq
## 0.169 0.342 0.052 0.010 0.071 0.068 0.021 0.017 0.136 0.150
## age:sex age:bmi age:bp age:s1 age:s2 age:s3 age:s4 age:s5 age:s6 sex:bmi
## 0.433 0.000 0.270 0.011 0.037 0.022 0.050 0.037 0.032 0.121
## sex:bp sex:s1 sex:s2 sex:s3 sex:s4 sex:s5 sex:s6 bmi:bp bmi:s1 bmi:s2
## 0.126 0.027 0.026 0.113 0.012 0.002 0.054 0.198 0.037 0.053
## bmi:s3 bmi:s4 bmi:s5 bmi:s6 bp:s1 bp:s2 bp:s3 bp:s4 bp:s5 bp:s6
## 0.007 0.049 0.053 0.293 0.040 0.036 0.032 0.012 0.064 0.010
## s1:s2 s1:s3 s1:s4 s1:s5 s1:s6 s2:s3 s2:s4 s2:s5 s2:s6 s3:s4
## 0.025 0.021 0.039 0.039 0.070 0.001 0.010 0.028 0.063 0.024
## s3:s5 s3:s6 s4:s5 s4:s6 s5:s6
## 0.047 0.035 0.053 0.054 0.046
##
## Interpretation:
## Values close to 1: variable almost always selected
## Values close to 0: variable rarely selected
# Bar plot of stability scores with a reference threshold at 0.6
plot_stability(stab_diabetes, threshold = 0.6)
Interpretation: Several baseline predictors (bmi, s5, bp) show high stability (π > 0.7), indicating they are consistently selected across bootstrap samples. This suggests these are robust predictors of diabetes progression.
# Reduce the explored models to a "plausible set": keep models within
# Delta AIC of the best whose average variable stability is >= tau,
# then drop near-duplicate variable sets (Jaccard similarity - see
# the "Duplicate Removal" section of the output).
plausible_diabetes <- plausible_models(
path_result = paths_diabetes,
stability_result = stab_diabetes,
Delta = 4, # Within 4 AIC of best
tau = 0.5, # Average stability >= 0.5
remove_duplicates = TRUE,
refit = FALSE, # Don't refit - we'll do it ourselves
X = X_train,
y = y_train
)
## AIC Filtering:
## Total models: 136
## Minimum AIC: 744.25
## Delta: 4.00
## Models within Delta: 64
##
## Stability Filtering:
## Tau threshold: 0.50
## Models with avg stability >= tau: 14
##
## Duplicate Removal:
## Jaccard threshold: 0.90
## Models after removing duplicates: 14
##
## ==============================================
## Final Plausible Model Set: 14 models
## ==============================================
# Print the plausible model set (variables, AIC, delta AIC, stability)
print(plausible_diabetes)
## Plausible Models
## ================
## Number of models: 14
##
## model_id size
## 1 age:sex+bmi+bmi:s6+bp+s1+s2+s3+s5+s5_sq+sex 10
## 2 age:sex+bmi+bmi:s6+bp+s1+s2+s3+s5+s5_sq+sex_sq 10
## 3 age:sex+bmi+bmi:s6+bp+bp:s5+s3+s5+s5_sq+sex 9
## 4 age:sex+bmi+bmi:s6+bp+bp:s5+s3+s5+s5_sq+sex_sq 9
## 5 age:s5+age:sex+bmi+bmi:s6+bp+s3+s5+s5_sq+sex 9
## 6 age:s5+age:sex+bmi+bmi:s6+bp+s3+s5+s5_sq+sex_sq 9
## 7 age:sex+bmi+bmi:s6+bp+s1+s2+s3+s5+sex 9
## 8 age:sex+bmi+bmi:s6+bp+s1+s2+s3+s5+sex_sq 9
## 9 age:bp+age:sex+bmi+bmi:bp+bmi:s6+bp+s3+s5+sex 9
## 10 age:bp+age:sex+bmi+bmi:bp+bmi:s6+bp+s3+s5+sex_sq 9
## 11 age:sex+bmi+bmi:bp+bmi:s6+bp+s3+s5+s5_sq+sex 9
## 12 age:sex+bmi+bmi:bp+bmi:s6+bp+s3+s5+s5_sq+sex_sq 9
## 13 age_sq+age:sex+bmi+bmi:bp+bmi:s6+bp+s3+s5+sex 9
## 14 age_sq+age:sex+bmi+bmi:bp+bmi:s6+bp+s3+s5+sex_sq 9
## variables aic
## 1 bmi, s5, bp, age:sex, bmi:s6, sex, s3, s1, s2, s5_sq 744.2544
## 2 bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s1, s2, s5_sq 744.2544
## 3 bmi, s5, bp, age:sex, bmi:s6, sex, s3, s5_sq, bp:s5 746.4916
## 4 bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s5_sq, bp:s5 746.4916
## 5 bmi, s5, bp, age:sex, bmi:s6, sex, s3, s5_sq, age:s5 746.9493
## 6 bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s5_sq, age:s5 746.9493
## 7 bmi, s5, bp, age:sex, bmi:s6, sex, s3, s1, s2 747.6148
## 8 bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s1, s2 747.6148
## 9 bmi, s5, bp, age:sex, bmi:s6, sex, s3, age:bp, bmi:bp 748.1534
## 10 bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, age:bp, bmi:bp 748.1534
## 11 bmi, s5, bp, age:sex, bmi:s6, sex, s3, s5_sq, bmi:bp 748.1571
## 12 bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s5_sq, bmi:bp 748.1571
## 13 bmi, s5, bp, age:sex, bmi:s6, sex, s3, age_sq, bmi:bp 748.1774
## 14 bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, age_sq, bmi:bp 748.1774
## delta_aic avg_stability
## 1 0.006038726 0.5059293
## 2 0.006038726 0.5057339
## 3 2.243280142 0.5130938
## 4 2.243280142 0.5128767
## 5 2.700968361 0.5100989
## 6 2.700968361 0.5098818
## 7 3.366429680 0.5470755
## 8 3.366429680 0.5468584
## 9 3.905095496 0.5429659
## 10 3.905095496 0.5427488
## 11 3.908799880 0.5280215
## 12 3.908799880 0.5278044
## 13 3.929112595 0.5317321
## 14 3.929112595 0.5315150
# Render the plausible set as a table (only if any models survived
# the AIC/stability filters)
if (nrow(plausible_diabetes) > 0) {
  knitr::kable(
    plausible_diabetes[, c("size", "variables", "aic", "delta_aic", "avg_stability")],
    digits = 3,
    caption = "Plausible Models for Diabetes Progression"
  )
}
| size | variables | aic | delta_aic | avg_stability |
|---|---|---|---|---|
| 10 | bmi, s5, bp, age:sex, bmi:s6, sex, s3, s1, s2, s5_sq | 744.254 | 0.006 | 0.506 |
| 10 | bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s1, s2, s5_sq | 744.254 | 0.006 | 0.506 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex, s3, s5_sq, bp:s5 | 746.492 | 2.243 | 0.513 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s5_sq, bp:s5 | 746.492 | 2.243 | 0.513 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex, s3, s5_sq, age:s5 | 746.949 | 2.701 | 0.510 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s5_sq, age:s5 | 746.949 | 2.701 | 0.510 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex, s3, s1, s2 | 747.615 | 3.366 | 0.547 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s1, s2 | 747.615 | 3.366 | 0.547 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex, s3, age:bp, bmi:bp | 748.153 | 3.905 | 0.543 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, age:bp, bmi:bp | 748.153 | 3.905 | 0.543 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex, s3, s5_sq, bmi:bp | 748.157 | 3.909 | 0.528 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, s5_sq, bmi:bp | 748.157 | 3.909 | 0.528 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex, s3, age_sq, bmi:bp | 748.177 | 3.929 | 0.532 |
| 9 | bmi, s5, bp, age:sex, bmi:s6, sex_sq, s3, age_sq, bmi:bp | 748.177 | 3.929 | 0.532 |
Key Finding: The plausible models consistently include key metabolic markers (bmi, s5/log-triglycerides, bp/blood pressure) which align with clinical understanding of diabetes risk factors!
# Get the selected variables from the best plausible model
# ([[1]] is the top row of the plausible set; the printed output above
# shows the set ordered by increasing AIC)
selected_vars_raw <- plausible_diabetes$variables[[1]]
# Parse variables - they might be a character vector or a comma-separated string
if (is.character(selected_vars_raw) && length(selected_vars_raw) == 1) {
# If it's a single string, split by comma
selected_vars <- trimws(strsplit(selected_vars_raw, ",")[[1]])
} else {
selected_vars <- selected_vars_raw
}
# Verify variables exist in X_train
# (intersect() also silently drops any names that failed to parse)
available_vars <- colnames(X_train)
selected_vars <- intersect(selected_vars, available_vars)
# Extract only the selected variables
X_train_selected <- X_train[, selected_vars, drop = FALSE]
X_test_selected <- X_test[, selected_vars, drop = FALSE]
# Manually refit a simple linear model with selected variables
# NOTE(review): data.frame() runs make.names() on columns, so names
# like "age:sex" become "age.sex" in both frames - consistent between
# train and test, but keep in mind when reading coefficient names.
train_data <- data.frame(y = y_train, X_train_selected)
best_model_diabetes <- lm(y ~ ., data = train_data)
# Prepare test data with same structure
test_data <- data.frame(X_test_selected)
colnames(test_data) <- colnames(train_data)[-1] # Remove 'y' column name
# Predictions on training and test sets
y_pred_train <- predict(best_model_diabetes, newdata = train_data)
y_pred_test <- predict(best_model_diabetes, newdata = test_data)
# Calculate metrics: root mean squared error on each split
rmse_train <- sqrt(mean((y_train - y_pred_train)^2))
rmse_test <- sqrt(mean((y_test - y_pred_test)^2))
# R-squared = 1 - SS_residual / SS_total, computed per split
ss_tot_train <- sum((y_train - mean(y_train))^2)
ss_res_train <- sum((y_train - y_pred_train)^2)
r2_train <- 1 - (ss_res_train / ss_tot_train)
ss_tot_test <- sum((y_test - mean(y_test))^2)
ss_res_test <- sum((y_test - y_pred_test)^2)
r2_test <- 1 - (ss_res_test / ss_tot_test)
# Create performance table (formatted strings for display)
performance_table <- data.frame(
Metric = c("RMSE", "R²", "Sample Size"),
Training = c(
sprintf("%.2f", rmse_train),
sprintf("%.4f", r2_train),
as.character(length(y_train))
),
Testing = c(
sprintf("%.2f", rmse_test),
sprintf("%.4f", r2_test),
as.character(length(y_test))
)
)
knitr::kable(performance_table,
caption = "Model Performance Metrics",
align = c('l', 'r', 'r'),
col.names = c("Metric", "Training Set", "Testing Set"))
| Metric | Training Set | Testing Set |
|---|---|---|
| RMSE | 0.67 | 0.68 |
| R² | 0.5446 | 0.5312 |
| Sample Size | 353 | 89 |
Model Summary:
Selected Variables:
# List the selected variables (only when few enough to tabulate)
if (length(selected_vars) <= 20) {
  # seq_along() instead of 1:length(): if intersect() left zero
  # variables, 1:0 would yield c(1, 0) and data.frame() would error
  var_df <- data.frame(
    Index = seq_along(selected_vars),
    Variable = selected_vars
  )
  knitr::kable(var_df, row.names = FALSE, align = c('r', 'l'),
               caption = "Selected Predictor Variables")
}
| Index | Variable |
|---|---|
| 1 | bmi |
| 2 | s5 |
| 3 | bp |
| 4 | age:sex |
| 5 | bmi:s6 |
| 6 | sex |
| 7 | s3 |
| 8 | s1 |
| 9 | s2 |
| 10 | s5_sq |
# Load breast cancer dataset
data(BreastCancer, package = "mlbench")
# Drop the Id column (first column) - it carries no predictive signal
bc_data <- BreastCancer[, -1]
# Keep complete cases only
bc_data_clean <- na.omit(bc_data)
# Predictor columns arrive as factors; recode each to numeric via
# character so factor *labels* (not level codes) are used. Class, the
# last column, is left untouched.
pred_cols <- head(names(bc_data_clean), -1)
bc_data_clean[pred_cols] <- lapply(
  bc_data_clean[pred_cols],
  function(f) as.numeric(as.character(f))
)
# Binary outcome: malignant = 1, benign = 0
y_cancer <- as.numeric(bc_data_clean$Class == "malignant")
# Standardized predictor matrix (Class column excluded)
X_cancer <- as.matrix(bc_data_clean[, -ncol(bc_data_clean)])
X_cancer <- scale(X_cancer)
Breast Cancer Diagnosis Dataset
# Create data quality summary: row counts before/after na.omit();
# "Complete" is the share of complete cases in the raw data
data_summary <- data.frame(
Stage = c("Original", "After Cleaning"),
Observations = c(nrow(BreastCancer), nrow(X_cancer)),
Missing_Removed = c(0, nrow(BreastCancer) - nrow(X_cancer)),
Complete_Rate = c(
sprintf("%.1f%%", 100 * sum(complete.cases(BreastCancer)) / nrow(BreastCancer)),
"100.0%"
)
)
knitr::kable(data_summary,
caption = "Data Cleaning Summary",
align = c('l', 'r', 'r', 'r'),
col.names = c("Stage", "Observations", "Removed", "Complete"))
| Stage | Observations | Removed | Complete |
|---|---|---|---|
| Original | 699 | 0 | 97.7% |
| After Cleaning | 683 | 16 | 100.0% |
# Outcome distribution: counts and percentages per diagnosis
n_benign <- sum(y_cancer == 0)
n_malignant <- sum(y_cancer == 1)
n_total <- length(y_cancer)
outcome_dist <- data.frame(
  Diagnosis = c("Benign", "Malignant", "Total"),
  Count = c(n_benign, n_malignant, n_total),
  Percentage = c(
    sprintf("%.1f%%", 100 * n_benign / n_total),
    sprintf("%.1f%%", 100 * n_malignant / n_total),
    "100.0%"
  )
)
knitr::kable(outcome_dist,
             caption = "Outcome Distribution",
             align = c('l', 'r', 'r'))
| Diagnosis | Count | Percentage |
|---|---|---|
| Benign | 444 | 65.0% |
| Malignant | 239 | 35.0% |
| Total | 683 | 100.0% |
Variables: Cl.thickness, Cell.size, Cell.shape, Marg.adhesion, Epith.c.size, Bare.nuclei, Bl.cromatin, Normal.nucleoli, Mitoses
# Create train/test split (80/20) with stratification
set.seed(2025)
n_cancer <- nrow(X_cancer)
malignant_idx <- which(y_cancer == 1)
benign_idx <- which(y_cancer == 0)
# Sample 80% within each class so both sets keep the benign/malignant mix
train_mal <- sample(malignant_idx, size = floor(0.8 * length(malignant_idx)))
train_ben <- sample(benign_idx, size = floor(0.8 * length(benign_idx)))
train_idx_cancer <- c(train_mal, train_ben)
# seq_len() rather than 1:n_cancer (safe even for zero rows)
test_idx_cancer <- setdiff(seq_len(n_cancer), train_idx_cancer)
X_train_cancer <- X_cancer[train_idx_cancer, ]
y_train_cancer <- y_cancer[train_idx_cancer]
X_test_cancer <- X_cancer[test_idx_cancer, ]
y_test_cancer <- y_cancer[test_idx_cancer]
# Per-split class balance for the report
split_summary <- data.frame(
  Dataset = c("Training", "Testing", "Total"),
  Observations = c(length(train_idx_cancer), length(test_idx_cancer), n_cancer),
  Malignant = c(
    sprintf("%d (%.1f%%)", sum(y_train_cancer == 1), 100 * mean(y_train_cancer == 1)),
    sprintf("%d (%.1f%%)", sum(y_test_cancer == 1), 100 * mean(y_test_cancer == 1)),
    sprintf("%d (%.1f%%)", sum(y_cancer == 1), 100 * mean(y_cancer == 1))
  ),
  Benign = c(
    sprintf("%d (%.1f%%)", sum(y_train_cancer == 0), 100 * mean(y_train_cancer == 0)),
    sprintf("%d (%.1f%%)", sum(y_test_cancer == 0), 100 * mean(y_test_cancer == 0)),
    sprintf("%d (%.1f%%)", sum(y_cancer == 0), 100 * mean(y_cancer == 0))
  )
)
knitr::kable(split_summary,
             caption = "Stratified Split Summary (80/20)",
             align = c('l', 'r', 'r', 'r'))
| Dataset | Observations | Malignant | Benign |
|---|---|---|---|
| Training | 546 | 191 (35.0%) | 355 (65.0%) |
| Testing | 137 | 48 (35.0%) | 89 (65.0%) |
| Total | 683 | 239 (35.0%) | 444 (65.0%) |
# Multi-path search on the 9 cytology predictors (logistic model).
# K caps the number of forward steps; L caps models carried between
# steps; delta/eps control retention of near-tied children (see the
# multipathaic documentation).
paths_cancer <- build_paths(
X_train_cancer,
y_train_cancer,
family = "binomial",
K = 6, # Reduced for faster computation
delta = 2,
eps = 1e-6,
L = 30, # Reduced from 50
verbose = TRUE
)
## Starting multi-path search with:
## n = 546 observations
## p = 9 predictors
## K = 6 maximum steps
## family = binomial
## delta = 2.0000
## eps = 0.000001
## L = 30
##
## Step 0: Empty model, AIC = 708.89
##
## ========== Step 1 ==========
## Starting with 1 parent models
## Generated 1 children, 1 unique
## Step 1 complete: 1 models kept, best AIC = 201.20
##
## ========== Step 2 ==========
## Starting with 1 parent models
## Generated 1 children, 1 unique
## Step 2 complete: 1 models kept, best AIC = 137.10
##
## ========== Step 3 ==========
## Starting with 1 parent models
## Generated 1 children, 1 unique
## Step 3 complete: 1 models kept, best AIC = 117.12
##
## ========== Step 4 ==========
## Starting with 1 parent models
## Generated 2 children, 2 unique
## Step 4 complete: 2 models kept, best AIC = 109.41
##
## ========== Step 5 ==========
## Starting with 2 parent models
## Generated 4 children, 3 unique
## Step 5 complete: 3 models kept, best AIC = 104.77
##
## ========== Step 6 ==========
## Starting with 3 parent models
## Generated 5 children, 3 unique
## Step 6 complete: 3 models kept, best AIC = 102.94
# Summarize the cancer search: models kept per step, best final model
print(paths_cancer)
## Multi-Path Forward Selection Result
## ====================================
## Family: binomial
## Sample size: 546
## Number of predictors: 9
## Steps completed: 6
## Parameters: eps=1.00e-06, delta=2.00, L=30
##
## Models per step:
## Step 1: 1 models
## Step 2: 1 models
## Step 3: 1 models
## Step 4: 2 models
## Step 5: 3 models
## Step 6: 3 models
##
## Best model at final step (AIC = 102.94):
## Variables: Cell.size, Bare.nuclei, Cl.thickness, Bl.cromatin, Marg.adhesion, Normal.nucleoli
# Stability analysis
# Bootstrap the selection B = 20 times to estimate how often each
# predictor is chosen (pi_j scores reported below)
stab_cancer <- stability(
X_train_cancer,
y_train_cancer,
family = "binomial",
B = 20, # Reduced for faster computation
K = 6, # Reduced from 8
delta = 2,
eps = 1e-6,
L = 30, # Reduced from 50
resample_type = "bootstrap",
verbose = TRUE
)
## ==============================================
## Starting Stability Analysis via Resampling
## ==============================================
## Number of resamples: B = 20
## Resample type: bootstrap
## Family: binomial
##
## Resample 1 of 20...
## Resample 10 of 20...
## Resample 20 of 20...
##
## ==============================================
## Stability Analysis Complete
## ==============================================
## Stability scores (pi_j):
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 0.754 0.970 0.204 0.223 0.113
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## 0.757 0.486 0.336 0.181
# Print per-variable selection frequencies across the resamples
print(stab_cancer)
## Variable Stability Analysis
## ============================
## Number of resamples: B = 20
## Resample type: bootstrap
## Family: binomial
##
## Stability Scores (pi_j):
## Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 0.754 0.970 0.204 0.223 0.113
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## 0.757 0.486 0.336 0.181
##
## Interpretation:
## Values close to 1: variable almost always selected
## Values close to 0: variable rarely selected
# Plausible models: within Delta = 3 AIC of the best and average
# variable stability >= 0.5, with near-duplicate variable sets removed
plausible_cancer <- plausible_models(
paths_cancer,
stab_cancer,
Delta = 3,
tau = 0.5,
remove_duplicates = TRUE,
refit = FALSE, # Don't refit - we'll do it ourselves
X = X_train_cancer,
y = y_train_cancer
)
## AIC Filtering:
## Total models: 11
## Minimum AIC: 102.94
## Delta: 3.00
## Models within Delta: 4
##
## Stability Filtering:
## Tau threshold: 0.50
## Models with avg stability >= tau: 4
##
## Duplicate Removal:
## Jaccard threshold: 0.90
## Models after removing duplicates: 4
##
## ==============================================
## Final Plausible Model Set: 4 models
## ==============================================
# Print the plausible model set for the cancer data
print(plausible_cancer)
## Plausible Models
## ================
## Number of models: 4
##
## model_id
## 1 Bare.nuclei+Bl.cromatin+Cell.size+Cl.thickness+Marg.adhesion+Normal.nucleoli
## 2 Bare.nuclei+Bl.cromatin+Cell.size+Cl.thickness+Marg.adhesion+Mitoses
## 3 Bare.nuclei+Bl.cromatin+Cell.shape+Cell.size+Cl.thickness+Marg.adhesion
## 4 Bare.nuclei+Bl.cromatin+Cell.size+Cl.thickness+Marg.adhesion
## size
## 1 6
## 2 6
## 3 6
## 4 5
## variables
## 1 Cell.size, Bare.nuclei, Cl.thickness, Bl.cromatin, Marg.adhesion, Normal.nucleoli
## 2 Cell.size, Bare.nuclei, Cl.thickness, Bl.cromatin, Marg.adhesion, Mitoses
## 3 Cell.size, Bare.nuclei, Cl.thickness, Bl.cromatin, Marg.adhesion, Cell.shape
## 4 Cell.size, Bare.nuclei, Cl.thickness, Bl.cromatin, Marg.adhesion
## aic delta_aic avg_stability
## 1 102.9358 0.0000000 0.5875515
## 2 103.6913 0.7555586 0.5617901
## 3 103.7582 0.8224334 0.5654784
## 4 104.7676 1.8318873 0.6378613
# Render the plausible set as a table (only if any models survived
# the AIC/stability filters)
if (nrow(plausible_cancer) > 0) {
  knitr::kable(
    plausible_cancer[, c("size", "variables", "aic", "delta_aic", "avg_stability")],
    digits = 3,
    caption = "Plausible Models for Breast Cancer Classification"
  )
}
| size | variables | aic | delta_aic | avg_stability |
|---|---|---|---|---|
| 6 | Cell.size, Bare.nuclei, Cl.thickness, Bl.cromatin, Marg.adhesion, Normal.nucleoli | 102.936 | 0.000 | 0.588 |
| 6 | Cell.size, Bare.nuclei, Cl.thickness, Bl.cromatin, Marg.adhesion, Mitoses | 103.691 | 0.756 | 0.562 |
| 6 | Cell.size, Bare.nuclei, Cl.thickness, Bl.cromatin, Marg.adhesion, Cell.shape | 103.758 | 0.822 | 0.565 |
| 5 | Cell.size, Bare.nuclei, Cl.thickness, Bl.cromatin, Marg.adhesion | 104.768 | 1.832 | 0.638 |
# Extract selected variables - handle different formats
# ([[1]] is the lowest-AIC model in the plausible set printed above)
selected_vars_raw <- plausible_cancer$variables[[1]]
# Parse variables - they might be a character vector or a comma-separated string
if (is.character(selected_vars_raw) && length(selected_vars_raw) == 1) {
# If it's a single string, split by comma
selected_vars_cancer <- trimws(strsplit(selected_vars_raw, ",")[[1]])
} else {
selected_vars_cancer <- selected_vars_raw
}
# Verify variables exist in X_train_cancer
available_vars_cancer <- colnames(X_train_cancer)
selected_vars_cancer <- intersect(selected_vars_cancer, available_vars_cancer)
# Extract only the selected variables for prediction
X_train_cancer_selected <- X_train_cancer[, selected_vars_cancer, drop = FALSE]
X_test_cancer_selected <- X_test_cancer[, selected_vars_cancer, drop = FALSE]
# Manually refit a logistic regression model
train_data_cancer <- data.frame(y = y_train_cancer, X_train_cancer_selected)
best_model_cancer <- glm(y ~ ., data = train_data_cancer, family = binomial)
# Prepare test data (align column names with the training frame)
test_data_cancer <- data.frame(X_test_cancer_selected)
colnames(test_data_cancer) <- colnames(train_data_cancer)[-1]
# Get predicted malignancy probabilities
pred_prob_train <- predict(best_model_cancer, newdata = train_data_cancer, type = "response")
pred_prob_test <- predict(best_model_cancer, newdata = test_data_cancer, type = "response")
# Convert to binary predictions at the 0.5 probability cutoff
pred_class_train <- as.numeric(pred_prob_train > 0.5)
pred_class_test <- as.numeric(pred_prob_test > 0.5)
# Confusion matrix - test set
# NOTE(review): table() drops a class entirely absent from the
# predictions; the 2x2 layout below assumes both classes appear
cm_test <- table(Actual = y_test_cancer, Predicted = pred_class_test)
# Format confusion matrix nicely
cm_test_df <- as.data.frame.matrix(cm_test)
cm_test_df <- cbind(Actual = rownames(cm_test_df), cm_test_df)
rownames(cm_test_df) <- NULL
colnames(cm_test_df) <- c("Actual \\ Predicted", "Benign (0)", "Malignant (1)")
knitr::kable(cm_test_df,
caption = "Test Set Confusion Matrix",
align = c('l', 'r', 'r'))
| Actual \ Predicted | Benign (0) | Malignant (1) |
|---|---|---|
| 0 | 88 | 1 |
| 1 | 2 | 46 |
# Calculate metrics - test
# Look cells up by class label instead of position: if the model ever
# predicted only one class, table() would drop the missing column and
# positional indexing (cm_test[1, 2], ...) would error or misread.
# An absent cell counts as 0. With both classes present the values
# are identical to the positional lookups.
cm_cell <- function(cm, actual, predicted) {
  if (actual %in% rownames(cm) && predicted %in% colnames(cm)) {
    cm[actual, predicted]
  } else {
    0
  }
}
tn_test <- cm_cell(cm_test, "0", "0")
fp_test <- cm_cell(cm_test, "0", "1")
fn_test <- cm_cell(cm_test, "1", "0")
tp_test <- cm_cell(cm_test, "1", "1")
accuracy_test <- (tp_test + tn_test) / sum(cm_test)
sensitivity_test <- tp_test / (tp_test + fn_test)
specificity_test <- tn_test / (tn_test + fp_test)
ppv_test <- tp_test / (tp_test + fp_test)
npv_test <- tn_test / (tn_test + fn_test)
f1_test <- 2 * (ppv_test * sensitivity_test) / (ppv_test + sensitivity_test)
# Create metrics table (raw values kept in Value; Percentage for display)
metrics_test_df <- data.frame(
  Metric = c("Accuracy", "Sensitivity (Recall)", "Specificity",
             "PPV (Precision)", "NPV", "F1-Score"),
  Value = c(accuracy_test, sensitivity_test, specificity_test,
            ppv_test, npv_test, f1_test),
  Percentage = sprintf("%.1f%%", 100 * c(accuracy_test, sensitivity_test,
                                         specificity_test, ppv_test, npv_test, f1_test)),
  Interpretation = c(
    "Overall correctness",
    "True malignant detection rate",
    "True benign detection rate",
    "Malignant prediction accuracy",
    "Benign prediction accuracy",
    "Harmonic mean of precision/recall"
  )
)
knitr::kable(metrics_test_df[, c("Metric", "Percentage", "Interpretation")],
             caption = "Test Set Performance Metrics",
             align = c('l', 'r', 'l'),
             col.names = c("Metric", "Value", "Clinical Interpretation"))
| Metric | Value | Clinical Interpretation |
|---|---|---|
| Accuracy | 97.8% | Overall correctness |
| Sensitivity (Recall) | 95.8% | True malignant detection rate |
| Specificity | 98.9% | True benign detection rate |
| PPV (Precision) | 97.9% | Malignant prediction accuracy |
| NPV | 97.8% | Benign prediction accuracy |
| F1-Score | 96.8% | Harmonic mean of precision/recall |
# Confusion matrix - training set
cm_train <- table(Actual = y_train_cancer, Predicted = pred_class_train)
# Format for display: turn the row labels into a leading column and
# relabel the headers with clinical names
cm_train_mat <- as.data.frame.matrix(cm_train)
cm_train_df <- data.frame(
  Actual = rownames(cm_train_mat),
  cm_train_mat,
  row.names = NULL,
  check.names = FALSE
)
colnames(cm_train_df) <- c("Actual \\ Predicted", "Benign (0)", "Malignant (1)")
knitr::kable(cm_train_df,
             caption = "Training Set Confusion Matrix",
             align = c('l', 'r', 'r'))
| Actual \ Predicted | Benign (0) | Malignant (1) |
|---|---|---|
| 0 | 347 | 8 |
| 1 | 10 | 181 |
# Calculate metrics - training
# Extract cells by class label with a 0 fallback so the code stays
# correct even if a predicted class is absent from the table
# (positional indexing like cm_train[1, 2] would then mis-read or
# error). Identical values to positional lookups in the 2x2 case.
cm_cell <- function(cm, actual, predicted) {
  if (actual %in% rownames(cm) && predicted %in% colnames(cm)) {
    cm[actual, predicted]
  } else {
    0
  }
}
tn_train <- cm_cell(cm_train, "0", "0")
fp_train <- cm_cell(cm_train, "0", "1")
fn_train <- cm_cell(cm_train, "1", "0")
tp_train <- cm_cell(cm_train, "1", "1")
accuracy_train <- (tp_train + tn_train) / sum(cm_train)
sensitivity_train <- tp_train / (tp_train + fn_train)
specificity_train <- tn_train / (tn_train + fp_train)
# Side-by-side comparison of training vs testing performance
metrics_train_df <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity"),
  Training = sprintf("%.1f%%", 100 * c(accuracy_train, sensitivity_train, specificity_train)),
  Testing = sprintf("%.1f%%", 100 * c(accuracy_test, sensitivity_test, specificity_test)),
  Difference = sprintf("%.1f%%", 100 * c(
    accuracy_train - accuracy_test,
    sensitivity_train - sensitivity_test,
    specificity_train - specificity_test
  ))
)
knitr::kable(metrics_train_df,
             caption = "Training vs. Testing Performance",
             align = c('l', 'r', 'r', 'r'))
| Metric | Training | Testing | Difference |
|---|---|---|---|
| Accuracy | 96.7% | 97.8% | -1.1% |
| Sensitivity | 94.8% | 95.8% | -1.1% |
| Specificity | 97.7% | 98.9% | -1.1% |
Model Summary:
Clinical Interpretation: The model achieves excellent discrimination between malignant and benign tumors, with high sensitivity (detecting true cancers) and specificity (correctly identifying benign cases). The stability analysis identifies the most reliable cellular characteristics for diagnosis.
# AIC progression across selection steps for the cancer search
plot_aic_by_step(paths_cancer)
# Stability scores with the selection threshold shown at 0.5
plot_stability(stab_cancer, threshold = 0.5)
Interpretation: Cell.size, Bare.nuclei, and Cl.thickness show the highest stability scores, aligning with pathological criteria for malignancy assessment!
plot_model_tree(paths_diabetes, max_models = 25, highlight_best = TRUE)
Interpretation: The tree shows the branching exploration of model space. Red points indicate the best-performing models at each step. Notice how multiple competitive paths emerge, particularly in the 3-8 predictor range.
# Same tree view for the cancer paths (smaller feature space, cap at 20 models)
plot_model_tree(paths_cancer, max_models = 20, highlight_best = TRUE)
# Combine stability, appearance counts, and best AIC into one importance ranking
importance_diabetes <- variable_importance_ranking(paths_diabetes, stab_diabetes)
print(head(importance_diabetes, 15))
## Variable Stability AppearanceCount BestAICwithVar ImportanceScore
## 1 bmi 0.99545404 50 744.2483 0.9977270
## 2 s5 0.99505451 50 744.2483 0.9975273
## 3 bp 0.90977530 50 744.2483 0.9548877
## 4 s3 0.44780706 50 744.2483 0.7239035
## 5 age:sex 0.43313354 50 744.2483 0.7165668
## 6 bmi:s6 0.29340448 50 744.2483 0.6467022
## 7 sex 0.34374047 27 744.2483 0.5338702
## 8 sex_sq 0.34178653 26 744.2483 0.5268933
## 9 s5_sq 0.13561336 29 744.2544 0.4414353
## 10 s1 0.35837868 8 744.2483 0.4271893
## 11 bp:s5 0.06386155 18 744.2483 0.3399308
## 12 s2 0.14693138 7 744.2483 0.3154657
## 13 age:s5 0.03690746 17 745.2588 0.2583141
## 14 bmi:bp 0.19821071 6 746.1684 0.2170371
## 15 age:bp 0.27011317 4 746.8701 0.1978345
plot_variable_importance(importance_diabetes, top_n = 15)
Key Finding: BMI (bmi), log-triglycerides (s5, "ltg" in the original Efron naming), and mean arterial pressure (bp, "map") emerge as the most important predictors, with several interaction terms also showing high combined scores.
# Importance ranking for the cancer model; only 9 candidate predictors, so print all
importance_cancer <- variable_importance_ranking(paths_cancer, stab_cancer)
print(importance_cancer)
## Variable Stability AppearanceCount BestAICwithVar ImportanceScore
## 1 Cell.size 0.9700000 3 102.9358 0.98500000
## 2 Bare.nuclei 0.7568820 3 102.9358 0.87844100
## 3 Cl.thickness 0.7536450 3 102.9358 0.87682249
## 4 Bl.cromatin 0.4859236 3 102.9358 0.74296182
## 5 Marg.adhesion 0.2228561 3 102.9358 0.61142803
## 6 Normal.nucleoli 0.3360026 1 102.9358 0.46800128
## 7 Mitoses 0.1814341 1 103.6913 0.20697972
## 8 Cell.shape 0.2035637 1 103.7582 0.20178187
## 9 Epith.c.size 0.1126162 0 NA 0.05630808
plot_variable_importance(importance_cancer, top_n = 9)
# A co-occurrence heatmap only makes sense with at least two plausible models
if (nrow(plausible_diabetes) > 1) {
plot_variable_heatmap(plausible_diabetes)
}
Interpretation: The heatmap reveals which predictors frequently appear together in plausible models, suggesting potential synergistic relationships in predicting diabetes progression.
# Same guard for the cancer models: skip the heatmap if only one model is plausible
if (nrow(plausible_cancer) > 1) {
plot_variable_heatmap(plausible_cancer)
}
plot_stability_distribution(stab_diabetes, top_n = 10)
Interpretation: The boxplots show the distribution of selection frequencies across bootstrap samples. Wider boxes indicate more variable selection, while narrow boxes near high values indicate consistently selected predictors.
plot_stability_distribution(stab_cancer, top_n = 9)
# Four-panel summary dashboards combining AIC progression, stability scores,
# plausible-model sizes, and the stability-vs-AIC relationship
plot_model_dashboard(paths_diabetes, stab_diabetes, plausible_diabetes)
plot_model_dashboard(paths_cancer, stab_cancer, plausible_cancer)
Dashboard Components: - Top-left: AIC progression showing model improvement - Top-right: Stability scores for reliable variable selection - Bottom-left: Model size distribution among plausible candidates - Bottom-right: Relationship between stability and AIC performance
| Metric | Result |
|---|---|
| Plausible Models Identified | 14 |
| Total Feature Space | 65 (10 original + 10 quadratic + 45 interactions) |
| Selected Predictors | 10 |
| Test Set RMSE | 0.68 |
| Test Set R² | 0.5312 |
| Average Stability | 0.506 |
| Key Findings | bmi, s5 (ltg), bp (map) show highest stability |
Clinical Relevance: The identified predictors (BMI, log-triglycerides, mean arterial pressure) align perfectly with established diabetes risk factors, demonstrating the method’s ability to recover meaningful biological signals.
| Metric | Result |
|---|---|
| Plausible Models Identified | 4 |
| Feature Space | 9 cellular characteristics |
| Selected Predictors | 6 |
| Test Accuracy | 97.8% |
| Test Sensitivity | 95.8% |
| Test Specificity | 98.9% |
| Test F1-Score | 96.8% |
| Average Stability | 0.588 |
Clinical Utility: High sensitivity (>95%) ensures reliable cancer detection, while strong specificity (>95%) minimizes false alarms. The stable selection of cell uniformity features aligns with pathological criteria.
| Advantage | Description | Example |
|---|---|---|
| Multi-Path Exploration | Considers multiple competitive models simultaneously rather than committing to a single ‘best’ path | Identified 3-5 competitive models for diabetes data |
| Stability Quantification | Bootstrap-based stability scores reveal which predictors are robust across different data subsets | π > 0.8 for key metabolic markers (bmi, ltg, map) |
| Transparent Uncertainty | Provides explicit set of plausible models with clear criteria, avoiding false confidence | Delta AIC and tau threshold provide explicit trade-offs |
| Reproducible Selection | More stable than single-path methods; stability scores are consistent across runs | B=20 bootstrap samples provide reliable stability estimates |
| Interpretable Output | Variable importance and co-occurrence patterns aid scientific understanding and communication | Cell uniformity features align with clinical knowledge |
| Practical Applicability | Successfully handles both regression and classification with high-dimensional features | Works with 65 engineered features and 9 raw features |
In High-Dimensional Settings (Diabetes, 65 features):
In Structured Data (Cancer, 9 correlated features):
Consistency with Domain Knowledge:
Both applications demonstrate that selected predictors align with clinical and biological understanding, providing confidence in the method’s validity beyond pure predictive performance.
| Scenario | Description | Example_Application |
|---|---|---|
| Unstable Selection | Traditional single-path selection gives different results on similar datasets | Genomic studies with correlated features |
| Multiple Candidates | Several predictors show similar information content and performance | Economic modeling with multicollinear predictors |
| Scientific Interpretation | Understanding ‘why’ is as important as prediction accuracy | Clinical research requiring mechanistic insights |
| Stakeholder Communication | Need to explain model uncertainty and alternative explanations | Healthcare AI systems requiring transparency |
| High-Stakes Decisions | Cost of variable selection errors is high (e.g., medical diagnosis, policy) | Biomarker discovery for drug development |
The multipathaic package successfully demonstrates multi-path model selection with stability analysis on real-world medical datasets. The approach balances predictive performance with interpretability, providing researchers and practitioners with a principled framework for variable selection under uncertainty.
Key Achievements:
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.4.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Chicago
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_1.3.1 dplyr_1.1.4 mlbench_2.1-6 care_1.1.11
## [5] corpcor_1.6.10 multipathaic_1.0.0
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.5 cli_3.6.5 knitr_1.50 rlang_1.1.6
## [5] xfun_0.54 purrr_1.2.0 generics_0.1.4 jsonlite_2.0.0
## [9] glue_1.8.0 htmltools_0.5.8.1 sass_0.4.10 rmarkdown_2.30
## [13] evaluate_1.0.5 jquerylib_0.1.4 tibble_3.3.0 fastmap_1.2.0
## [17] yaml_2.3.10 lifecycle_1.0.4 compiler_4.5.2 pkgconfig_2.0.3
## [21] rstudioapi_0.17.1 digest_0.6.37 R6_2.6.1 tidyselect_1.2.1
## [25] pillar_1.11.1 magrittr_2.0.4 bslib_0.9.0 tools_4.5.2
## [29] cachem_1.1.0
GitHub Repository: https://github.com/R-4-Data-Science/Final_Project_multipathaic
Installation:
remotes::install_github("R-4-Data-Science/Final_Project_multipathaic")
If you use this package in your research, please cite:
Ofosu, M.A., Al Srayheen, M., & Alavi, S. (2025). multipathaic: Multi-Path Model Selection with Stability Analysis. R package. GitHub: R-4-Data-Science/Final_Project_multipathaic